"""
Semantic Scholar PDF Downloader

Downloads PDFs from Semantic Scholar using their API to find open access PDFs.
"""

import re
from typing import Optional
from urllib.parse import urlparse

import requests
from loguru import logger

from .base import BaseDownloader, ContentType, DownloadResult


class SemanticScholarDownloader(BaseDownloader):
    """Downloader for Semantic Scholar papers with open access PDF support."""

    def __init__(self, timeout: int = 30, api_key: Optional[str] = None):
        """
        Initialize Semantic Scholar downloader.

        Args:
            timeout: Request timeout in seconds
            api_key: Optional Semantic Scholar API key for higher rate limits
        """
        super().__init__(timeout)
        self.api_key = api_key
        self.base_api_url = "https://api.semanticscholar.org/graph/v1"

    def can_handle(self, url: str) -> bool:
        """Check if URL is from Semantic Scholar."""
        try:
            hostname = urlparse(url).hostname
            return bool(
                hostname
                and (
                    hostname == "semanticscholar.org"
                    or hostname.endswith(".semanticscholar.org")
                )
            )
        except (ValueError, AttributeError, TypeError):
            return False

    def download(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> Optional[bytes]:
        """Download content from Semantic Scholar."""
        result = self.download_with_result(url, content_type)
        return result.content if result.is_success else None

    def download_with_result(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> DownloadResult:
        """Download PDF and return detailed result with skip reason."""
        # Only support PDF downloads for now
        if content_type != ContentType.PDF:
            return DownloadResult(
                skip_reason="Text extraction not yet supported for Semantic Scholar"
            )

        # Extract paper ID from URL
        paper_id = self._extract_paper_id(url)
        if not paper_id:
            return DownloadResult(
                skip_reason="Invalid Semantic Scholar URL - could not extract paper ID"
            )

        logger.info(f"Looking up Semantic Scholar paper: {paper_id}")

        # Get paper details from API to find PDF URL
        pdf_url = self._get_pdf_url(paper_id)

        if not pdf_url:
            return DownloadResult(
                skip_reason="Not open access - subscription required"
            )

        # Download the PDF from the open access URL
        logger.info(f"Downloading open access PDF from: {pdf_url}")
        pdf_content = super()._download_pdf(pdf_url)

        if pdf_content:
            return DownloadResult(content=pdf_content, is_success=True)
        else:
            return DownloadResult(
                skip_reason="Open access PDF URL found but download failed"
            )

    def _extract_paper_id(self, url: str) -> Optional[str]:
        """
        Extract Semantic Scholar paper ID from URL.

        Handles formats like:
        - https://www.semanticscholar.org/paper/abc123...
        - https://www.semanticscholar.org/paper/Title-Here/abc123...

        Returns:
            Paper ID (hash) or None if not found
        """
        # Use urlparse for more robust URL handling (handles query strings, fragments)
        parsed = urlparse(url)
        if not parsed.netloc or "semanticscholar.org" not in parsed.netloc:
            return None

        # Extract paper ID from path (40 character hex hash)
        # Handles /paper/{hash} or /paper/{title}/{hash}
        path = parsed.path
        match = re.search(r"/paper/(?:[^/]+/)?([a-f0-9]{40})", path)
        return match.group(1) if match else None

    def _get_pdf_url(self, paper_id: str) -> Optional[str]:
        """
        Get open access PDF URL from Semantic Scholar API.

        Args:
            paper_id: Semantic Scholar paper ID (hash)

        Returns:
            PDF URL if available, None otherwise
        """
        try:
            # Construct API request
            api_url = f"{self.base_api_url}/paper/{paper_id}"
            params = {"fields": "openAccessPdf"}

            # Add API key header if available
            headers = {}
            if self.api_key:
                headers["x-api-key"] = self.api_key

            # Make API request
            response = self.session.get(
                api_url, params=params, headers=headers, timeout=self.timeout
            )

            if response.status_code == 200:
                data = response.json()

                # Extract PDF URL from openAccessPdf field
                open_access_pdf = data.get("openAccessPdf")
                if open_access_pdf and isinstance(open_access_pdf, dict):
                    pdf_url = open_access_pdf.get("url")
                    if pdf_url:
                        logger.info(
                            f"Found open access PDF for paper {paper_id}: {pdf_url}"
                        )
                        return pdf_url

                logger.info(
                    f"No open access PDF available for paper {paper_id}"
                )
                return None

            elif response.status_code == 404:
                logger.warning(
                    f"Paper not found in Semantic Scholar: {paper_id}"
                )
                return None
            else:
                logger.warning(
                    f"Semantic Scholar API error: {response.status_code}"
                )
                return None

        except requests.exceptions.RequestException:
            logger.exception("Failed to query Semantic Scholar API")
            return None
        except ValueError:
            # JSON decode errors are expected runtime errors
            logger.exception("Failed to parse Semantic Scholar API response")
            return None
        # Note: KeyError and TypeError are not caught - they indicate programming
        # bugs that should propagate for debugging
